# Install (if necessary) and load every package the analysis depends on.
# BUG FIX: the original `if (!require(pkg)) install.packages(pkg)` pattern
# installed a missing package but never loaded it afterwards, so a fresh
# session with missing packages would still fail. Here we load after install.
pkgs <- c(
  "ggfortify", "dplyr", "qdap", "corrplot", "caret", "e1071",
  "caTools", "class", "rpart", "rpart.plot", "randomForest", "naivebayes"
)

for (pkg in pkgs) {
  if (!require(pkg, character.only = TRUE)) {
    install.packages(pkg)
    library(pkg, character.only = TRUE)
  }
}

R for Marketing

Income prediction

Our dataset was initially created for marketing purposes of a company. They wanted to see which customer will respond to an offer for a product or service. While analyzing this dataset, we noticed that some clients didn’t state their income and we were interested if, given all the other variables in the dataset, it was possible to predict it.

Therefore, the aim of our project was to estimate the income of customers who did not state it in our dataset. We wanted also to see whether variables, such as education, marital status, or money spent on particular products were important for income prediction.

Before we started creating models, our data needed some pre-processing.

# Read the marketing campaign data. Despite the .csv extension the file is
# tab-delimited (read.delim uses sep = "\t"); the str() output below confirms
# it parses into 2240 rows x 29 columns.
marketing_dat <-read.delim("marketing_campaign.csv")
str(marketing_dat) 
## 'data.frame':    2240 obs. of  29 variables:
##  $ ID                 : int  5524 2174 4141 6182 5324 7446 965 6177 4855 5899 ...
##  $ Year_Birth         : int  1957 1954 1965 1984 1981 1967 1971 1985 1974 1950 ...
##  $ Education          : chr  "Graduation" "Graduation" "Graduation" "Graduation" ...
##  $ Marital_Status     : chr  "Single" "Single" "Together" "Together" ...
##  $ Income             : int  58138 46344 71613 26646 58293 62513 55635 33454 30351 5648 ...
##  $ Kidhome            : int  0 1 0 1 1 0 0 1 1 1 ...
##  $ Teenhome           : int  0 1 0 0 0 1 1 0 0 1 ...
##  $ Dt_Customer        : chr  "04-09-2012" "08-03-2014" "21-08-2013" "10-02-2014" ...
##  $ Recency            : int  58 38 26 26 94 16 34 32 19 68 ...
##  $ MntWines           : int  635 11 426 11 173 520 235 76 14 28 ...
##  $ MntFruits          : int  88 1 49 4 43 42 65 10 0 0 ...
##  $ MntMeatProducts    : int  546 6 127 20 118 98 164 56 24 6 ...
##  $ MntFishProducts    : int  172 2 111 10 46 0 50 3 3 1 ...
##  $ MntSweetProducts   : int  88 1 21 3 27 42 49 1 3 1 ...
##  $ MntGoldProds       : int  88 6 42 5 15 14 27 23 2 13 ...
##  $ NumDealsPurchases  : int  3 2 1 2 5 2 4 2 1 1 ...
##  $ NumWebPurchases    : int  8 1 8 2 5 6 7 4 3 1 ...
##  $ NumCatalogPurchases: int  10 1 2 0 3 4 3 0 0 0 ...
##  $ NumStorePurchases  : int  4 2 10 4 6 10 7 4 2 0 ...
##  $ NumWebVisitsMonth  : int  7 5 4 6 5 6 6 8 9 20 ...
##  $ AcceptedCmp3       : int  0 0 0 0 0 0 0 0 0 1 ...
##  $ AcceptedCmp4       : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ AcceptedCmp5       : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ AcceptedCmp1       : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ AcceptedCmp2       : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Complain           : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Z_CostContact      : int  3 3 3 3 3 3 3 3 3 3 ...
##  $ Z_Revenue          : int  11 11 11 11 11 11 11 11 11 11 ...
##  $ Response           : int  1 0 0 0 0 0 0 0 1 0 ...
# Column-wise summaries of the raw data. Noteworthy: Income has 24 NA's and
# a max of 666666 (outlier), and Z_CostContact / Z_Revenue are constant.
summary(marketing_dat)
##        ID          Year_Birth    Education         Marital_Status    
##  Min.   :    0   Min.   :1893   Length:2240        Length:2240       
##  1st Qu.: 2828   1st Qu.:1959   Class :character   Class :character  
##  Median : 5458   Median :1970   Mode  :character   Mode  :character  
##  Mean   : 5592   Mean   :1969                                        
##  3rd Qu.: 8428   3rd Qu.:1977                                        
##  Max.   :11191   Max.   :1996                                        
##                                                                      
##      Income          Kidhome          Teenhome      Dt_Customer       
##  Min.   :  1730   Min.   :0.0000   Min.   :0.0000   Length:2240       
##  1st Qu.: 35303   1st Qu.:0.0000   1st Qu.:0.0000   Class :character  
##  Median : 51382   Median :0.0000   Median :0.0000   Mode  :character  
##  Mean   : 52247   Mean   :0.4442   Mean   :0.5062                     
##  3rd Qu.: 68522   3rd Qu.:1.0000   3rd Qu.:1.0000                     
##  Max.   :666666   Max.   :2.0000   Max.   :2.0000                     
##  NA's   :24                                                           
##     Recency         MntWines         MntFruits     MntMeatProducts 
##  Min.   : 0.00   Min.   :   0.00   Min.   :  0.0   Min.   :   0.0  
##  1st Qu.:24.00   1st Qu.:  23.75   1st Qu.:  1.0   1st Qu.:  16.0  
##  Median :49.00   Median : 173.50   Median :  8.0   Median :  67.0  
##  Mean   :49.11   Mean   : 303.94   Mean   : 26.3   Mean   : 166.9  
##  3rd Qu.:74.00   3rd Qu.: 504.25   3rd Qu.: 33.0   3rd Qu.: 232.0  
##  Max.   :99.00   Max.   :1493.00   Max.   :199.0   Max.   :1725.0  
##                                                                    
##  MntFishProducts  MntSweetProducts  MntGoldProds    NumDealsPurchases
##  Min.   :  0.00   Min.   :  0.00   Min.   :  0.00   Min.   : 0.000   
##  1st Qu.:  3.00   1st Qu.:  1.00   1st Qu.:  9.00   1st Qu.: 1.000   
##  Median : 12.00   Median :  8.00   Median : 24.00   Median : 2.000   
##  Mean   : 37.53   Mean   : 27.06   Mean   : 44.02   Mean   : 2.325   
##  3rd Qu.: 50.00   3rd Qu.: 33.00   3rd Qu.: 56.00   3rd Qu.: 3.000   
##  Max.   :259.00   Max.   :263.00   Max.   :362.00   Max.   :15.000   
##                                                                      
##  NumWebPurchases  NumCatalogPurchases NumStorePurchases NumWebVisitsMonth
##  Min.   : 0.000   Min.   : 0.000      Min.   : 0.00     Min.   : 0.000   
##  1st Qu.: 2.000   1st Qu.: 0.000      1st Qu.: 3.00     1st Qu.: 3.000   
##  Median : 4.000   Median : 2.000      Median : 5.00     Median : 6.000   
##  Mean   : 4.085   Mean   : 2.662      Mean   : 5.79     Mean   : 5.317   
##  3rd Qu.: 6.000   3rd Qu.: 4.000      3rd Qu.: 8.00     3rd Qu.: 7.000   
##  Max.   :27.000   Max.   :28.000      Max.   :13.00     Max.   :20.000   
##                                                                          
##   AcceptedCmp3      AcceptedCmp4      AcceptedCmp5      AcceptedCmp1    
##  Min.   :0.00000   Min.   :0.00000   Min.   :0.00000   Min.   :0.00000  
##  1st Qu.:0.00000   1st Qu.:0.00000   1st Qu.:0.00000   1st Qu.:0.00000  
##  Median :0.00000   Median :0.00000   Median :0.00000   Median :0.00000  
##  Mean   :0.07277   Mean   :0.07455   Mean   :0.07277   Mean   :0.06429  
##  3rd Qu.:0.00000   3rd Qu.:0.00000   3rd Qu.:0.00000   3rd Qu.:0.00000  
##  Max.   :1.00000   Max.   :1.00000   Max.   :1.00000   Max.   :1.00000  
##                                                                         
##   AcceptedCmp2        Complain        Z_CostContact   Z_Revenue 
##  Min.   :0.00000   Min.   :0.000000   Min.   :3     Min.   :11  
##  1st Qu.:0.00000   1st Qu.:0.000000   1st Qu.:3     1st Qu.:11  
##  Median :0.00000   Median :0.000000   Median :3     Median :11  
##  Mean   :0.01339   Mean   :0.009375   Mean   :3     Mean   :11  
##  3rd Qu.:0.00000   3rd Qu.:0.000000   3rd Qu.:3     3rd Qu.:11  
##  Max.   :1.00000   Max.   :1.000000   Max.   :3     Max.   :11  
##                                                                 
##     Response     
##  Min.   :0.0000  
##  1st Qu.:0.0000  
##  Median :0.0000  
##  Mean   :0.1491  
##  3rd Qu.:0.0000  
##  Max.   :1.0000  
## 

After analyzing our dataset, we decided to perform the following actions:

  1. Changing AcceptedCmps, Response and Complain to factors.
  2. Removing Z_cost contact - Cost to contact a customer and Z_Revenue - Revenue after client accepting campaign. They describe company’s costs and revenues related to each client and are the same for each customer, so they won’t influence customer’s income prediction.
  3. Creating a new column: Age, which will contain age of each customer, calculated by subtracting Year_Birth - customer’s year of birth from Dt_Customer - Date of customer’s enrollment with the company. The column will therefore describe the age of a customer at the moment he enrolled with the company.
  4. Splitting elements of Education column into new (1|0) columns.
  5. Splitting elements of Relationship Status column into new (1|0) columns.
  6. Creating a separate dataset for rows with no income.

Below you will find an R code for each step.

1. Changing AcceptedCmps, Response and Complain to factors.

# The campaign-acceptance flags, Response and Complain are 0/1 indicators,
# so convert each of them to a factor.
binary_cols <- c(paste0("AcceptedCmp", 1:5), "Response", "Complain")
for (col in binary_cols) {
  marketing_dat[[col]] <- factor(marketing_dat[[col]])
}

2. Removing Z_cost contact and Z_Revenue

# Drop Z_CostContact and Z_Revenue: both are constant across all customers
# and therefore carry no information for income prediction.
marketing_dat <- marketing_dat[, !(names(marketing_dat) %in% c("Z_CostContact", "Z_Revenue"))]

3. Creating a new column: Age

# Derive Age = enrollment year - birth year.
# BUG FIX: Dt_Customer is stored as "dd-mm-YYYY" (e.g. "04-09-2012"), but the
# original code parsed it with format = "%Y-%m-%d", reading the day as the
# year and producing negative "ages" (see Age min of -1986 in the summary
# further below). "%d-%m-%Y" is the correct format string.
marketing_dat$Year_Birth <- as.numeric(marketing_dat$Year_Birth)  # numeric so we can subtract it
marketing_dat$Dt_Customer <- as.Date(marketing_dat$Dt_Customer, format = "%d-%m-%Y")  # parse dd-mm-YYYY
marketing_dat$Dt_Customer <- as.numeric(format(marketing_dat$Dt_Customer, "%Y"))  # keep only the year
marketing_dat$Age <- marketing_dat$Dt_Customer - marketing_dat$Year_Birth  # age at enrollment

marketing_dat <- marketing_dat %>% relocate(Age, .before = Education)  # put Age where Year_Birth was

marketing_dat <- subset(marketing_dat, select = -c(Year_Birth, Dt_Customer))  # drop the source columns

4. Splitting elements of Education column into new (1|0) columns.

# One-hot encode Education: mtabulate() turns each education level into its
# own 0/1 indicator column (2n Cycle, Basic, Graduation, Master, PhD).
education_dummies <- mtabulate(strsplit(as.character(marketing_dat$Education), '""'))
marketing_dat <- cbind(marketing_dat, education_dummies)

marketing_dat <- subset(marketing_dat, select = -c(Education))  # original column no longer needed

# The new indicator columns should be factors.
for (col in c("2n Cycle", "Basic", "Graduation", "Master", "PhD")) {
  marketing_dat[[col]] <- factor(marketing_dat[[col]])
}

5. Splitting elements of Relationship Status column into new (1|0) columns.

# One-hot encode Marital_Status the same way. The odd levels Absurd, Alone
# and YOLO each occur at most 3 times.
# NOTE(review): the subset() below removes those three dummy COLUMNS, not the
# rows containing them (the affected rows remain with all-zero indicators),
# which differs from what the surrounding text claims — worth confirming.
marital_dummies <- mtabulate(strsplit(as.character(marketing_dat$Marital_Status), '""'))
marketing_dat <- cbind(marketing_dat, marital_dummies)

marketing_dat <- subset(marketing_dat, select = -c(Marital_Status, Absurd, Alone, YOLO))

# The remaining indicator columns become factors.
for (col in c("Divorced", "Married", "Single", "Together", "Widow")) {
  marketing_dat[[col]] <- factor(marketing_dat[[col]])
}

6. Creating a separate dataset for rows with no income.

# Customers with a missing Income go into their own dataset; these are the
# rows whose income we ultimately want to predict.
marketing_dat_noincome <- subset(marketing_dat, is.na(Income))

summary(marketing_dat_noincome)
##        ID             Age            Income       Kidhome      
##  Min.   : 1295   Min.   :-1986   Min.   : NA   Min.   :0.0000  
##  1st Qu.: 3063   1st Qu.:-1966   1st Qu.: NA   1st Qu.:0.0000  
##  Median : 5526   Median :-1952   Median : NA   Median :1.0000  
##  Mean   : 5944   Mean   :-1953   Mean   :NaN   Mean   :0.6667  
##  3rd Qu.: 8598   3rd Qu.:-1949   3rd Qu.: NA   3rd Qu.:1.0000  
##  Max.   :10629   Max.   :-1913   Max.   : NA   Max.   :2.0000  
##                                  NA's   :24                    
##     Teenhome         Recency         MntWines       MntFruits     
##  Min.   :0.0000   Min.   : 4.00   Min.   :  5.0   Min.   :  0.00  
##  1st Qu.:0.0000   1st Qu.:35.50   1st Qu.: 22.0   1st Qu.:  1.00  
##  Median :1.0000   Median :62.00   Median : 76.0   Median :  3.50  
##  Mean   :0.5833   Mean   :58.04   Mean   :197.2   Mean   : 21.33  
##  3rd Qu.:1.0000   3rd Qu.:82.25   3rd Qu.:286.0   3rd Qu.: 24.25  
##  Max.   :2.0000   Max.   :96.00   Max.   :861.0   Max.   :138.00  
##                                                                   
##  MntMeatProducts  MntFishProducts  MntSweetProducts  MntGoldProds   
##  Min.   :   3.0   Min.   :  0.00   Min.   :  0.00   Min.   :  1.00  
##  1st Qu.:  14.5   1st Qu.:  2.00   1st Qu.:  1.50   1st Qu.:  6.75  
##  Median :  35.0   Median :  8.00   Median :  4.00   Median : 17.50  
##  Mean   : 162.7   Mean   : 27.17   Mean   : 30.21   Mean   : 49.25  
##  3rd Qu.: 177.0   3rd Qu.: 40.75   3rd Qu.: 31.75   3rd Qu.: 55.00  
##  Max.   :1607.0   Max.   :164.00   Max.   :263.00   Max.   :362.00  
##                                                                     
##  NumDealsPurchases NumWebPurchases  NumCatalogPurchases NumStorePurchases
##  Min.   : 0.000    Min.   : 0.000   Min.   : 0.000      Min.   : 0.000   
##  1st Qu.: 1.000    1st Qu.: 1.000   1st Qu.: 0.000      1st Qu.: 3.000   
##  Median : 1.500    Median : 2.500   Median : 1.000      Median : 4.000   
##  Mean   : 2.458    Mean   : 4.042   Mean   : 1.833      Mean   : 4.792   
##  3rd Qu.: 3.000    3rd Qu.: 5.250   3rd Qu.: 3.000      3rd Qu.: 7.000   
##  Max.   :12.000    Max.   :27.000   Max.   :10.000      Max.   :12.000   
##                                                                          
##  NumWebVisitsMonth AcceptedCmp3 AcceptedCmp4 AcceptedCmp5 AcceptedCmp1
##  Min.   :0.000     0:24         0:21         0:23         0:22        
##  1st Qu.:3.000     1: 0         1: 3         1: 1         1: 2        
##  Median :6.000                                                        
##  Mean   :5.083                                                        
##  3rd Qu.:7.000                                                        
##  Max.   :9.000                                                        
##                                                                       
##  AcceptedCmp2 Complain Response 2n Cycle Basic  Graduation Master PhD   
##  0:24         0:24     0:23     0:21     0:24   0:13       0:19   0:19  
##  1: 0         1: 0     1: 1     1: 3     1: 0   1:11       1: 5   1: 5  
##                                                                         
##                                                                         
##                                                                         
##                                                                         
##                                                                         
##  Divorced Married Single Together Widow 
##  0:24     0:17    0:15   0:17     0:23  
##  1: 0     1: 7    1: 9   1: 7     1: 1  
##                                         
##                                         
##                                         
##                                         
## 
# Keep only rows with a known Income (the complement of marketing_dat_noincome).
marketing_dat_clean <- subset(marketing_dat, !is.na(Income))

Visualization

Principal components analysis

By reducing the complexity of the data through PCA a clear pattern of different subgroups regarding income is visible.

We can see that the first principal component explains around 36% of the variance and the second principal component explains roughly 11%.

marketing_dat_numeric <- marketing_dat_clean[-c(18:34)] # drop the factor columns (18:34); prcomp requires numeric input
# NOTE(review): column 1 (ID) is an arbitrary identifier but is still included
# in the PCA — consider dropping it as well.


# Principal component analysis on centred, unit-variance columns.
marketing_pca <- prcomp(marketing_dat_numeric,
                        center = TRUE,
                        scale = TRUE)
summary(marketing_pca)
## Importance of components:
##                           PC1    PC2     PC3    PC4     PC5     PC6     PC7
## Standard deviation     2.4653 1.3852 1.09516 1.0242 0.97824 0.91588 0.87338
## Proportion of Variance 0.3575 0.1129 0.07055 0.0617 0.05629 0.04934 0.04487
## Cumulative Proportion  0.3575 0.4704 0.54092 0.6026 0.65891 0.70826 0.75313
##                           PC8    PC9    PC10    PC11    PC12    PC13    PC14
## Standard deviation     0.8226 0.7973 0.70020 0.67415 0.65205 0.62593 0.59663
## Proportion of Variance 0.0398 0.0374 0.02884 0.02673 0.02501 0.02305 0.02094
## Cumulative Proportion  0.7929 0.8303 0.85916 0.88590 0.91091 0.93395 0.95489
##                           PC15    PC16    PC17
## Standard deviation     0.55275 0.49517 0.46487
## Proportion of Variance 0.01797 0.01442 0.01271
## Cumulative Proportion  0.97287 0.98729 1.00000
# 1 if the customer's income is above the sample median, 0 otherwise.
marketing_dat_numeric$Income_binary <- as.numeric(marketing_dat_clean$Income > median(marketing_dat_clean$Income))

We add another column “Income_binary” which splits the sample into two groups. One group has an income smaller than the median and the second group has an income which is higher than the median.

# Quintile breakpoints of Income (0%, 20%, ..., 100%).
breaks <- quantile(marketing_dat_numeric$Income, probs = seq(0, 1, 0.2))


# Bin Income into its quintile (1 = lowest 20%, 5 = highest); unique() guards
# against duplicated breakpoints, and the factor labels produced by cut() are
# converted back into plain numbers.
quintile_labels <- 1:(length(unique(breaks)) - 1)
marketing_dat_numeric$Income_quantiles <- as.numeric(as.character(
  cut(marketing_dat_numeric$Income,
      breaks = unique(breaks),
      right = FALSE,
      include.lowest = TRUE,
      labels = quintile_labels)
))

We add another column “Income_quantiles” and split the data into 5 equally sized groups, where the first group contains the lowest 20% of income, the second group the 20% of income thereafter and so on.

When reducing the complexity of the dataset and plotting the dataset on the first two principal components, datapoints close to each other can be interpreted as similar. Below we can see that datapoints further on the left tend to be in a higher quantile.

# Project the observations onto the first two principal components, colouring
# each point by its income quintile.
marketing_pca_plot_quant <- autoplot(marketing_pca,
                               data = marketing_dat_numeric,
                               colour = "Income_quantiles")
marketing_pca_plot_quant

Plotting the principal component against the binary classification shows a similar picture. On the right-hand side, there are almost only datapoints below the median and on the left-hand side there are almost only datapoints above the median.

# Same PCA projection, coloured by the binary above/below-median split.
marketing_pca_plot_bin <- autoplot(marketing_pca,
                               data = marketing_dat_numeric,
                               colour = "Income_binary")
marketing_pca_plot_bin

Correlation matrix

# Keep a copy with the factor columns intact (used later to build the
# regression dataset), then coerce every column of marketing_dat_clean to
# numeric so it can go into cor(), scale() and knn().
marketing_dat_clean_cat_num <- marketing_dat_clean
for (i in seq_len(ncol(marketing_dat_clean))) { # robust to column count, unlike the hard-coded 1:34
  marketing_dat_clean[, i] <- as.numeric(as.character(marketing_dat_clean[, i]))
}

Looking at the data we realised that our column names were too long for visualization purposes and thus created a matrix where each column name has a corresponding short form with a maximum length of 3 characters.

# Map each (long) column name to a short code for readable correlation-plot
# labels; the matrix is printed so readers can translate the codes back.
real_names<-colnames(marketing_dat_clean)
short_names<-c("ID","Age","Inc","KH","TH","Rec","MW","MFr","MMP","MFP","MSP","MGP","NDP","NWP","NCP","NSP","NWV","AC3","AC4","AC5","AC1","AC2","Com","Res","2nC","Bas","Grad","Mas","PhD","Div","Mar","Sgl","Tog","Wid")
decifer_matrix<-cbind(real_names,short_names) 
decifer_matrix
##       real_names            short_names
##  [1,] "ID"                  "ID"       
##  [2,] "Age"                 "Age"      
##  [3,] "Income"              "Inc"      
##  [4,] "Kidhome"             "KH"       
##  [5,] "Teenhome"            "TH"       
##  [6,] "Recency"             "Rec"      
##  [7,] "MntWines"            "MW"       
##  [8,] "MntFruits"           "MFr"      
##  [9,] "MntMeatProducts"     "MMP"      
## [10,] "MntFishProducts"     "MFP"      
## [11,] "MntSweetProducts"    "MSP"      
## [12,] "MntGoldProds"        "MGP"      
## [13,] "NumDealsPurchases"   "NDP"      
## [14,] "NumWebPurchases"     "NWP"      
## [15,] "NumCatalogPurchases" "NCP"      
## [16,] "NumStorePurchases"   "NSP"      
## [17,] "NumWebVisitsMonth"   "NWV"      
## [18,] "AcceptedCmp3"        "AC3"      
## [19,] "AcceptedCmp4"        "AC4"      
## [20,] "AcceptedCmp5"        "AC5"      
## [21,] "AcceptedCmp1"        "AC1"      
## [22,] "AcceptedCmp2"        "AC2"      
## [23,] "Complain"            "Com"      
## [24,] "Response"            "Res"      
## [25,] "2n Cycle"            "2nC"      
## [26,] "Basic"               "Bas"      
## [27,] "Graduation"          "Grad"     
## [28,] "Master"              "Mas"      
## [29,] "PhD"                 "PhD"      
## [30,] "Divorced"            "Div"      
## [31,] "Married"             "Mar"      
## [32,] "Single"              "Sgl"      
## [33,] "Together"            "Tog"      
## [34,] "Widow"               "Wid"

Based on the data with the new column names we will now try to plot a correlation plot.

# Full mixed correlation plot (shaded values below the diagonal, colours
# above) using the short column names; column 1 (ID) is excluded.
marketing_dat_clean_cor<-marketing_dat_clean
colnames(marketing_dat_clean_cor)<-short_names
corrplot.mixed(cor(marketing_dat_clean_cor[,2:34]),title = "Correlation Matrix of all Variables",lower="shade",upper="color",mar=c(0,0,2,0))

As we can see, our data still has too many dimensions for us to clearly interpret the plot. Therefore, we will plot only the correlations with income, thus reducing the number of columns.

# Correlation of every variable (except Income itself) with Income:
# a one-column matrix whose row names are the short variable codes.
keep_cols <- colnames(marketing_dat_clean_cor) != "Inc"
data_cor <- cor(marketing_dat_clean_cor[, keep_cols], marketing_dat_clean_cor$Inc)

corrplot(data_cor, title = "Correlation Matrix of Income with all Variables", method = 'color', mar = c(0, 0, 2, 0))

This already leads to a clearer picture but nonetheless has some flaws to it. That’s why we will now plot a correlation matrix of income with the other variables, containing only those variables which have an absolute correlation value above 0.2. This should leave us with a correlation matrix with fewer rows and columns and thus a more straightforwardly interpretable picture.

# Blank out (set to NA) every correlation whose absolute value is <= 0.2.
# The original element-wise loop's else-branch (`data_cor[i] <- data_cor[i]`)
# was a no-op; a single vectorised assignment does the same job.
data_cor[abs(data_cor) <= 0.2] <- NA

# Rebuild as a one-column data frame, drop the NA rows, and turn the
# surviving values back into a matrix for corrplot(). The row count is taken
# from the data itself rather than hard-coding 14.
df <- data.frame(matrix(data_cor, length(data_cor)))
rownames(df) <- rownames(data_cor)
colnames(df) <- c("corr_values")
df1 <- na.omit(df)
df2 <- matrix(df1$corr_values, nrow(df1), 1, byrow = TRUE)

# Short codes of the variables that survived the 0.2 threshold, plus a
# placeholder vector for their full names (filled in by the loop below).
rel_shortnames <- rownames(df1)
rel_names <- seq_along(rel_shortnames)
print(rel_shortnames)
##  [1] "KH"  "MW"  "MFr" "MMP" "MFP" "MSP" "MGP" "NWP" "NCP" "NSP" "NWV" "AC5"
## [13] "AC1" "Bas"
# Translate each short code back to its full column name via decifer_matrix.
# BUG FIX: the original nested loop used `==` (a comparison) where `<-`
# (assignment) was intended, so rel_names was never actually filled in.
# A match() lookup performs the intended translation in one step.
rel_names <- decifer_matrix[match(rel_shortnames, decifer_matrix[, "short_names"]), "real_names"]

# Label the matrix with the short codes and plot the surviving correlations.
rownames(df2) <- rel_shortnames
colnames(df2) <- c("Inc")
corrplot(df2, title = "Correlation of Income with variables where abs(cor>0.2)", method = 'color', mar = c(0, 2, 2, 2))

This already looks a lot better. First we replaced the values with an absolute correlation below 0.2 with “NA”s, which we then omitted. After omitting we took the corresponding column names and ended up with this correlation matrix. Using the previously printed decifer matrix we can now interpret the variables. Some of the correlations might not be that straightforward, but it does appear reasonable that Gold purchases and Wine purchases correlate positively with Income, while Basic education has a negative effect (seeing as we do not have the category “no education”, Basic education is the lowest education level).

Finally, we are going to plot income against all other variables to look for visual correlations or interesting effects. For the purpose of visualisation we excluded a customer whose income was equal to “666666”, which is far above all others and thus made the graphs uninterpretable.

# Scatter Income against every other variable; the single outlier with
# Income == 666666 is excluded so the remaining points stay readable.
# BUG FIX: in the original, `col = "blue"` sat inside the subset() call (a
# misplaced closing parenthesis), so the colour never reached plot().
plot(Income ~ ., data = subset(marketing_dat_clean, Income < max(Income)), col = "blue")

Some of the graphs show slight graphical correlations, but whether or not these are significant we will show in the following.

Comparing the data via classification approaches

Despite the variable of interest (Income) being a quantitative variable, we also decided to look at some classification approaches, since, depending on the level of accuracy needed for the prediction, they might also be a valid approach.

Performing a k-NN model

In a first step we performed a stepwise regression to see which variables to include in the k-NN model.

Next, we created two additional variables “Income_binary” and “Income_quantiles” for the “marketing_dat_clean” dataset, since for the k-NN we will include the one-hot encoding of the categorical variables.

# Same Income_binary / Income_quantiles construction as for the PCA dataset,
# this time added to marketing_dat_clean itself.
income_median <- median(marketing_dat_clean$Income)
marketing_dat_clean$Income_binary <- as.numeric(marketing_dat_clean$Income > income_median)

breaks <- quantile(marketing_dat_clean$Income, probs = seq(0, 1, 0.2))

# Quintile bin (1 = lowest 20% ... 5 = highest 20%) as a plain number.
marketing_dat_clean$Income_quantiles <- as.numeric(as.character(
  cut(marketing_dat_clean$Income,
      breaks = unique(breaks),
      right = FALSE,
      include.lowest = TRUE,
      labels = 1:(length(unique(breaks)) - 1))
))

For assessing the predictive power of the models, we set up an out of sample exercise. The data set is split into a training and a test sample: 80% train sample vs 20% test sample.

# Reproducible 80/20 train/test split.
set.seed(1234)
n <- nrow(marketing_dat_clean)
n1 <- floor(0.8 * n)
id_train <- sample(n, n1)  # indices of the training rows

train <- marketing_dat_clean[id_train, ]
test  <- marketing_dat_clean[-id_train, ]

K-NN model with a binary split

Then, we used the caret package to perform a k-NN model on our training data.

# Out-of-sample performance with the caret package
fitControl <- trainControl(method = "cv", number = 5)

set.seed(12345)

knnFit_scaled <- train(factor(Income_binary) ~ (Age) +  (Kidhome) + (Teenhome) + (Recency) + (MntWines) + (MntFruits) + (MntMeatProducts) + (MntSweetProducts) + (NumDealsPurchases) + (NumWebPurchases) + (NumCatalogPurchases) + (NumStorePurchases) + (NumWebVisitsMonth) + AcceptedCmp3 +(AcceptedCmp4) + (AcceptedCmp5) + (AcceptedCmp1) + Basic + PhD, 
                       data = train, 
                       method = "knn", 
                       trControl = fitControl, 
                       tuneGrid = data.frame(k = c(2,3,4,5)),
                       preProcess = c("center", "scale"))# here we can specify if we want scaling(X-mean(x))/sd(X)

Using the model generated above, we see that it classifies our testing data with an accuracy of around 90%.

# Distribution of predicted classes on the held-out test set.
table(predict(knnFit_scaled, test))
## 
##   0   1 
## 216 228
# Accuracy, kappa and sensitivity/specificity of the k-NN predictions
# against the true binary labels (~90% accuracy on the test set).
confusionMatrix(predict(knnFit_scaled, test), factor(test$Income_binary))
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 190  27
##          1  17 210
##                                           
##                Accuracy : 0.9009          
##                  95% CI : (0.8693, 0.9271)
##     No Information Rate : 0.5338          
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.8015          
##                                           
##  Mcnemar's Test P-Value : 0.1748          
##                                           
##             Sensitivity : 0.9179          
##             Specificity : 0.8861          
##          Pos Pred Value : 0.8756          
##          Neg Pred Value : 0.9251          
##              Prevalence : 0.4662          
##          Detection Rate : 0.4279          
##    Detection Prevalence : 0.4887          
##       Balanced Accuracy : 0.9020          
##                                           
##        'Positive' Class : 0               
## 

K-NN model split into quantiles

Then, we classified the data into quantiles. In a first step we had to scale our data.

# Z-score scale all columns before computing k-NN distances.
# NOTE(review): two concerns worth revisiting —
# (1) test_scale is standardised with the *test* sample's own means/sds
#     rather than the training statistics, and
# (2) the scaled matrices still contain Income, Income_binary and
#     Income_quantiles, so the target leaks into the distance computation.
train_scale <- scale(train)
test_scale <- scale(test)

Then, we performed the classification for k = 1.

# 1-nearest-neighbour classification into income quintiles.
classifier_knn <- knn(train = train_scale,
                      test = test_scale,
                      cl = train$Income_quantiles,
                      k = 1)
# Rows: true quintile; columns: predicted quintile.
confusion_matrix <- table(test$Income_quantiles, classifier_knn)
confusion_matrix
##    classifier_knn
##      1  2  3  4  5
##   1 57 16  1  0  0
##   2 29 61  5  0  0
##   3  4 16 54 11  2
##   4  0  1 18 64 13
##   5  2  0  3 20 67
# Misclassification rate on the test set; accuracy = 1 - error rate.
misClassError <- mean(classifier_knn != test$Income_quantiles)
print(paste("Accuracy =", 1-misClassError))
## [1] "Accuracy = 0.682432432432432"

For k = 1 the accuracy is around 68%.

# Repeat the quintile classification with k = 3.
classifier_knn <- knn(train = train_scale,
                      test = test_scale,
                      cl = train$Income_quantiles,
                      k = 3)
# Rows: true quintile; columns: predicted quintile.
confusion_matrix <- table(test$Income_quantiles, classifier_knn)
confusion_matrix
##    classifier_knn
##      1  2  3  4  5
##   1 54 20  0  0  0
##   2 20 64 10  1  0
##   3  4 20 52  7  4
##   4  0  0 21 64 11
##   5  0  0  4 24 64
# Misclassification rate for k = 3.
misClassError <- mean(classifier_knn != test$Income_quantiles)
print(paste("Accuracy =", 1-misClassError))
## [1] "Accuracy = 0.671171171171171"

For k = 3 the accuracy is around 67%.

# Repeat the quintile classification with k = 5.
classifier_knn <- knn(train = train_scale,
                      test = test_scale,
                      cl = train$Income_quantiles,
                      k = 5)
# Rows: true quintile; columns: predicted quintile.
confusion_matrix <- table(test$Income_quantiles, classifier_knn)
confusion_matrix
##    classifier_knn
##      1  2  3  4  5
##   1 51 23  0  0  0
##   2 26 61  7  1  0
##   3  4 14 60  8  1
##   4  0  1 24 58 13
##   5  0  1  5 21 65
# Misclassification rate for k = 5.
misClassError <- mean(classifier_knn != test$Income_quantiles)
print(paste("Accuracy =", 1-misClassError))
## [1] "Accuracy = 0.664414414414414"

For k=5 the accuracy is around 66%.

As shown above the accuracy for classification into quantiles is lower in comparison to the classification into a binary class which is logical, since a classification into quantiles is more complex and, therefore, more prone to error. On top of that, we can see that k = 1 delivers the best result with the highest accuracy.

Performing a naive bayes classification

# Naive Bayes on the same predictor set as the k-NN model, with the same
# 5-fold cross-validation control object.
nbFit <- train(factor(Income_binary) ~ Age + Kidhome + Teenhome + Recency +
                 MntWines + MntFruits + MntMeatProducts + MntSweetProducts +
                 NumDealsPurchases + NumWebPurchases + NumCatalogPurchases +
                 NumStorePurchases + NumWebVisitsMonth + AcceptedCmp3 +
                 AcceptedCmp4 + AcceptedCmp5 + AcceptedCmp1 + Basic + PhD,
               data = train,
               method = "naive_bayes",
               trControl = fitControl)
# Test-set performance of the naive Bayes classifier.
confusionMatrix(predict(nbFit, test), factor(test$Income_binary))
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 194  37
##          1  13 200
##                                           
##                Accuracy : 0.8874          
##                  95% CI : (0.8542, 0.9153)
##     No Information Rate : 0.5338          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.7754          
##                                           
##  Mcnemar's Test P-Value : 0.001143        
##                                           
##             Sensitivity : 0.9372          
##             Specificity : 0.8439          
##          Pos Pred Value : 0.8398          
##          Neg Pred Value : 0.9390          
##              Prevalence : 0.4662          
##          Detection Rate : 0.4369          
##    Detection Prevalence : 0.5203          
##       Balanced Accuracy : 0.8905          
##                                           
##        'Positive' Class : 0               
## 

As shown above, the naive Bayes approach does not perform as well in terms of accuracy as the k-NN model. Therefore, the k-NN model would be advisable.

Linear regression

To perform a linear regression we have to modify the dataframe with our marketing data. To avoid multicollinearity between dummy variables we have to delete one dummy from every group. So we delete one from education status (2n Cycle), then one from marital status (Single), and we remove ID, which is a random number.

marketing_dat_clean_reg = subset(marketing_dat_clean_cat_num, select = -c(ID, `2n Cycle`, Single))

Now we will split the data set into training and test one, and perform a linear regression on the test sample.

# 90/10 train-test split for the regression models.
# Fix: derive n from marketing_dat_clean_reg itself — the original computed
# nrow(marketing_dat_clean), a different object; if the row counts ever
# diverge, sampled indices can exceed the frame and silently yield NA rows.
n <- nrow(marketing_dat_clean_reg)
n1 <- floor(n * 0.9) # number of obs in train set
set.seed(1234) ## for reproducibility
id_train <- sample(seq_len(n), n1)
train_dat <- marketing_dat_clean_reg[id_train, ]
test_dat  <- marketing_dat_clean_reg[-id_train, ]

Here we are performing a linear regression with all variables:

# Baseline model: regress Income on every remaining predictor in the
# training set, then inspect coefficients and fit statistics.
linear_all <- lm(Income ~ ., data = train_dat)
summary(linear_all)
## 
## Call:
## lm(formula = Income ~ ., data = train_dat)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -78876  -5932   -164   5136 629420 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          4.772e+04  5.564e+04   0.858 0.391160    
## Age                  5.021e-01  2.840e+01   0.018 0.985896    
## Kidhome              2.047e+03  9.983e+02   2.051 0.040421 *  
## Teenhome             5.244e+03  8.933e+02   5.870 5.10e-09 ***
## Recency             -2.032e+01  1.386e+01  -1.466 0.142712    
## MntWines             1.441e+01  2.171e+00   6.640 4.05e-11 ***
## MntFruits            2.373e+01  1.404e+01   1.690 0.091194 .  
## MntMeatProducts      1.642e+01  3.037e+00   5.405 7.27e-08 ***
## MntFishProducts      1.313e+01  1.065e+01   1.233 0.217830    
## MntSweetProducts     2.653e+01  1.309e+01   2.026 0.042887 *  
## MntGoldProds        -2.060e+00  9.374e+00  -0.220 0.826098    
## NumDealsPurchases   -7.092e+02  2.709e+02  -2.618 0.008907 ** 
## NumWebPurchases      1.095e+03  1.979e+02   5.534 3.55e-08 ***
## NumCatalogPurchases  5.161e+02  2.391e+02   2.158 0.031030 *  
## NumStorePurchases    4.907e+02  1.911e+02   2.568 0.010305 *  
## NumWebVisitsMonth   -2.978e+03  2.381e+02 -12.509  < 2e-16 ***
## AcceptedCmp31       -2.081e+03  1.611e+03  -1.292 0.196560    
## AcceptedCmp41        3.646e+03  1.805e+03   2.020 0.043541 *  
## AcceptedCmp51        3.137e+03  1.967e+03   1.595 0.110942    
## AcceptedCmp11        3.227e+03  1.907e+03   1.692 0.090744 .  
## AcceptedCmp21        1.402e+03  3.671e+03   0.382 0.702614    
## Complain1           -1.239e+03  4.165e+03  -0.298 0.766067    
## Response1           -4.120e+02  1.328e+03  -0.310 0.756447    
## Basic1              -1.075e+04  2.909e+03  -3.694 0.000227 ***
## Graduation1          2.110e+03  1.428e+03   1.478 0.139552    
## Master1              1.914e+03  1.669e+03   1.147 0.251583    
## PhD1                 3.139e+03  1.635e+03   1.919 0.055067 .  
## Divorced1            1.364e+03  1.493e+03   0.914 0.360977    
## Married1             6.464e+01  1.076e+03   0.060 0.952113    
## Together1            1.393e+03  1.166e+03   1.195 0.232382    
## Widow1               4.327e+02  2.365e+03   0.183 0.854827    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 17480 on 1963 degrees of freedom
## Multiple R-squared:  0.5331, Adjusted R-squared:  0.5259 
## F-statistic:  74.7 on 30 and 1963 DF,  p-value: < 2.2e-16

As we can see there are many variables where the p-value for the coefficient is above 0.05, so we fail to reject the null hypothesis that the coefficient equals 0 — i.e., those variables are not statistically significant. The \(Adjusted\) \(R^2\) is also not astonishing. We can only explain 52.59% of the income variance with this model.

Displaying coefficients where the p-value is under 0,05 and we can reject the \(H_0\).

which(summary(linear_all)$coefficients[,4] < 0.05)
##             Kidhome            Teenhome            MntWines     MntMeatProducts 
##                   3                   4                   6                   8 
##    MntSweetProducts   NumDealsPurchases     NumWebPurchases NumCatalogPurchases 
##                  10                  12                  13                  14 
##   NumStorePurchases   NumWebVisitsMonth       AcceptedCmp41              Basic1 
##                  15                  16                  18                  24

Creating a model only with significant variables where the p-value < 0.05:

# Restricted model: keep only the 12 predictors whose coefficients were
# significant (p < 0.05) in the full model above.
linear_new <- lm(Income ~ Kidhome + Teenhome + MntWines + MntMeatProducts + MntSweetProducts+ NumDealsPurchases+ NumWebPurchases + NumCatalogPurchases + NumStorePurchases + NumWebVisitsMonth + AcceptedCmp4 + Basic, data = train_dat)
summary(linear_new)
## 
## Call:
## lm(formula = Income ~ Kidhome + Teenhome + MntWines + MntMeatProducts + 
##     MntSweetProducts + NumDealsPurchases + NumWebPurchases + 
##     NumCatalogPurchases + NumStorePurchases + NumWebVisitsMonth + 
##     AcceptedCmp4 + Basic, data = train_dat)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -80663  -5927   -221   5330 631432 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          49060.912   1868.065  26.263  < 2e-16 ***
## Kidhome               2109.484    979.128   2.154  0.03132 *  
## Teenhome              5042.397    853.185   5.910 4.02e-09 ***
## MntWines                16.186      1.975   8.196 4.40e-16 ***
## MntMeatProducts         18.616      2.915   6.387 2.10e-10 ***
## MntSweetProducts        37.585     12.050   3.119  0.00184 ** 
## NumDealsPurchases     -818.558    266.661  -3.070  0.00217 ** 
## NumWebPurchases       1113.135    193.331   5.758 9.86e-09 ***
## NumCatalogPurchases    538.775    231.042   2.332  0.01980 *  
## NumStorePurchases      519.712    182.039   2.855  0.00435 ** 
## NumWebVisitsMonth    -3103.025    232.444 -13.350  < 2e-16 ***
## AcceptedCmp41         4606.402   1690.015   2.726  0.00647 ** 
## Basic1              -12631.026   2647.410  -4.771 1.97e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 17490 on 1981 degrees of freedom
## Multiple R-squared:  0.5279, Adjusted R-squared:  0.5251 
## F-statistic: 184.6 on 12 and 1981 DF,  p-value: < 2.2e-16

As we can see now all coefficients are significant.

Now we will do a step-wise backward selection, to exclude some insignificant variables from our model:

step(linear_all, direction = "backward")

The model with the lowest AIC looks as follows.

# Refit the model chosen by the backward step-wise search
# (formula copied verbatim from the step() output above).
linear_step <- lm(formula = Income ~ Kidhome + Teenhome + Recency + MntWines + 
    MntFruits + MntMeatProducts + MntSweetProducts + NumDealsPurchases + 
    NumWebPurchases + NumCatalogPurchases + NumStorePurchases + 
    NumWebVisitsMonth + AcceptedCmp3 + AcceptedCmp4 + AcceptedCmp5 + 
    AcceptedCmp1 + Basic, data = train_dat)
summary(linear_step)
## 
## Call:
## lm(formula = Income ~ Kidhome + Teenhome + Recency + MntWines + 
##     MntFruits + MntMeatProducts + MntSweetProducts + NumDealsPurchases + 
##     NumWebPurchases + NumCatalogPurchases + NumStorePurchases + 
##     NumWebVisitsMonth + AcceptedCmp3 + AcceptedCmp4 + AcceptedCmp5 + 
##     AcceptedCmp1 + Basic, data = train_dat)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -79670  -5791   -169   5087 630511 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          49447.211   1996.321  24.769  < 2e-16 ***
## Kidhome               2074.239    980.874   2.115  0.03458 *  
## Teenhome              5363.726    858.027   6.251 4.98e-10 ***
## Recency                -19.235     13.449  -1.430  0.15281    
## MntWines                15.037      2.110   7.126 1.45e-12 ***
## MntFruits               25.707     13.438   1.913  0.05590 .  
## MntMeatProducts         16.786      2.975   5.642 1.93e-08 ***
## MntSweetProducts        27.540     12.695   2.169  0.03017 *  
## NumDealsPurchases     -755.104    267.607  -2.822  0.00482 ** 
## NumWebPurchases       1105.704    193.816   5.705 1.34e-08 ***
## NumCatalogPurchases    547.172    235.179   2.327  0.02009 *  
## NumStorePurchases      501.059    187.133   2.678  0.00748 ** 
## NumWebVisitsMonth    -3015.978    234.746 -12.848  < 2e-16 ***
## AcceptedCmp31        -2245.765   1558.117  -1.441  0.14965    
## AcceptedCmp41         3605.268   1748.344   2.062  0.03933 *  
## AcceptedCmp51         2891.638   1914.210   1.511  0.13105    
## AcceptedCmp11         3220.306   1860.601   1.731  0.08365 .  
## Basic1              -12682.112   2642.161  -4.800 1.71e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 17460 on 1976 degrees of freedom
## Multiple R-squared:  0.5312, Adjusted R-squared:  0.5272 
## F-statistic: 131.7 on 17 and 1976 DF,  p-value: < 2.2e-16

As we can see, the model with the lowest AIC still has some coefficients that are not significant, with p-values even above 0.15.

Now we compare all 3 models in terms of AIC:

c(AIC(linear_all), AIC(linear_new), AIC(linear_step))
## [1] 44649.02 44634.93 44631.04

As we can see the lowest AIC is for Linear model created with step-wise procedure. We plot residuals for the step model to see if they are normally distributed

# Residuals vs. observation index for the step-wise model.
plot(linear_step$residuals)

# Histogram of the residuals to eyeball normality.
hist(linear_step$residuals)

As we can see there are some big outliers so we will try to create log-linear model where the income will be in log.

# Log-linear specification: model log(Income) on all predictors to damp
# the influence of the large income outliers seen in the residual plots.
linear_all_log <- lm(log(Income) ~ ., data = train_dat)
# NOTE: this AIC is not comparable with the level-scale models, because the
# response is on a different (log) scale.
AIC(linear_all_log)
## [1] 696.786

The AIC for the log model is much lower than for the level-scale ones because we now operate on a different scale, so comparing the goodness of those models with this indicator is useless.

Creating an log-linear model with step-wise method:

step(linear_all_log, direction = "backward")

Creating the best log-lin model in terms of AIC

# Best log-linear model by AIC (formula copied from the step() output above).
linear_step_log <- lm(formula = log(Income) ~ Age + Kidhome + Teenhome + MntWines + 
    MntFruits + MntMeatProducts + MntFishProducts + MntSweetProducts + 
    NumDealsPurchases + NumWebPurchases + NumStorePurchases + 
    NumWebVisitsMonth + AcceptedCmp4 + Response + Basic + Graduation + 
    Master + PhD, data = train_dat)

We will test the models on the held-out test sample by predicting the outcomes and comparing them with the real values.

# Out-of-sample predictions on the held-out test set for each candidate model.
y_hat_step <- predict( linear_step, newdata = test_dat) # predictions

y_hat_new <- predict( linear_new, newdata = test_dat) # predictions

# Predictions on the log scale; back-transformed with exp() when scored below.
y_hat_step_log <- predict( linear_step_log, newdata = test_dat) # predictions

y_hat_all <- predict( linear_all, newdata = test_dat) # predictions

Now we calculate the root mean squared error (RMSE) for all those models:

# NOTE(review): sqrt() wraps each mean squared error, so the values reported
# below are actually RMSEs even though the method labels say "mean squared
# error". The log model's predictions are back-transformed with exp() before
# scoring so all four errors are on the same (income) scale.
mse_df <- data.frame( method = c( "mean sqared error of step-wise model", "mean squared error of step log model", "mean squared error of full model", "mean squared error of only significant coefficients model") , MSE_value = c( sqrt(c(mean((y_hat_step - test_dat$Income)^2), 
       mean(( exp(y_hat_step_log) - test_dat$Income)^2), mean((y_hat_all - test_dat$Income)^2), mean((y_hat_new - test_dat$Income)^2) ))) )
                      
mse_df
##                                                      method MSE_value
## 1                      mean sqared error of step-wise model  11555.26
## 2                      mean squared error of step log model  16467.42
## 3                          mean squared error of full model  11684.60
## 4 mean squared error of only significant coefficients model  11542.41

As we can see, despite the fact that the AIC was the lowest for step-model, the restricted model with only significant coefficients is performing the best when we want to predict real values from a our data set. The logarithmic model is terrible in comparison to other models.

So we can assume that the best model is the most restricted one with the smallest number of coefficients. It means that only Kidhome, Teenhome, MntWines, MntMeatProducts, MntSweetProducts, NumDealsPurchases, NumWebPurchases, NumCatalogPurchases, NumStorePurchases, NumWebVisitsMonth, AcceptedCmp4 and Basic are relevant when we want to predict the income.

Regression trees

We built some regression trees to see whether they capture the effect of the different variables on income successfully. First, we built a tree using \(rpart\), but as this already includes some cost complexity pruning, we decided to build a full tree using all variables, and prune it back ourselves with a complexity parameter that we find most optimal. We found an elbow point at 4 splits, so we built a pruned tree with these 4 splits.

# Keep the first 34 columns of the engineered frame as the modelling data
# (predictors + Income).
marketing_dat_clean <- marketing_dat_clean_cat_num[1:34]
rt <- rpart(Income ~ ., data = marketing_dat_clean) #using all variables
rpart.plot(rt, extra = 101, main = "Rpart Tree") #This already includes some cost complexity pruning

# Grow an unpruned tree (cp = 0), then prune it back at the elbow cp found
# via the commented-out printcp/rsq.rpart exploration below.
rt_full <- rpart(Income ~ ., data = marketing_dat_clean, control = list(cp = 0)) #full tree with a complexity parameter of 0
#rpart.plot(rt_full, extra = 101) #this cannot really be plotted
#printcp(rt_full) 
#rsq.rpart(rt_full) #elbow point at around 4 splits, the cp of this is 1.9733e-02
rt_pruned <- prune(rt_full, cp = 1.9733e-02 ) #prune the tree back with optimal cp
rpart.plot(rt_pruned, main = "Pruned Tree")

In our pruned tree, the most important variables are how much one has spent on wines in the past 2 years, the number of web visits per month, and how much one spent on meat products.

Random forests

Using the \(randomForest\) package, we built a random forest model from our data.

# Rename column 25 ("2n Cycle") to a syntactically valid name — randomForest
# cannot handle the space/leading digit in the original column name.
colnames(marketing_dat_clean)[25] <- "twoncycle"
colnames(marketing_dat_noincome)[25] <- "twoncycle" #renaming the variable because the randomforest could not handle it
# Random forest regression on all predictors; importance = TRUE records both
# the permutation (%IncMSE) and node-purity importance measures.
rrf <- randomForest(Income ~ ., 
                    data = marketing_dat_clean,
                    importance = TRUE)

# Print the fit summary (number of trees, mtry, OOB MSE, % variance explained).
rrf
## 
## Call:
##  randomForest(formula = Income ~ ., data = marketing_dat_clean,      importance = TRUE) 
##                Type of random forest: regression
##                      Number of trees: 500
## No. of variables tried at each split: 11
## 
##           Mean of squared residuals: 279804448
##                     % Var explained: 55.82
importance(rrf)
##                         %IncMSE IncNodePurity
## ID                  -1.43730665   43708668369
## Age                  0.03251331   51127700361
## Kidhome              1.88008877    7955003997
## Teenhome             3.59036325   20909968167
## Recency              2.41967127   34384268125
## MntWines             5.61106214  276716947034
## MntFruits            7.14409757   95090404456
## MntMeatProducts      8.32992232  229540491862
## MntFishProducts      2.38590230   33340273530
## MntSweetProducts     3.75475189   44227736137
## MntGoldProds         2.76393446   22895942782
## NumDealsPurchases    4.27972416   68285564017
## NumWebPurchases      4.31638706   22611673721
## NumCatalogPurchases  6.27788384  117889829776
## NumStorePurchases    3.20246653   83824599619
## NumWebVisitsMonth   13.43273829   98940222566
## AcceptedCmp3         4.58936536     799895281
## AcceptedCmp4         5.73548312     957596346
## AcceptedCmp5        22.19189488   11121299869
## AcceptedCmp1         9.56113235    2226548798
## AcceptedCmp2         3.51327985     307405649
## Complain            -1.99317782     127180757
## Response             8.91466870    1338226781
## twoncycle            3.85488813     845698452
## Basic                0.34109481    1415063658
## Graduation           1.75456977    2460007893
## Master               1.07935245    1068372070
## PhD                 -1.15579367    1820815331
## Divorced             1.03531361    1017540606
## Married              0.26035290    2584953269
## Single               1.40264616    1912666238
## Together            -1.41658694   15344420438
## Widow                6.95327496     700286823
varImpPlot(rrf)

The most important variables for minimising the residual sum of squares are also the MntWines, MntMeatProducts, NumWebVisitsMonth, just like in our pruned regression tree.

Model comparison

We used the \(caret\) package to compare our models using the training data.

set.seed(12345)
# Shared resampling scheme: 10-fold cross-validation, reused by every
# caret model trained below so their resampling results are comparable.
fitControl <- trainControl(## 10-fold CV
  method = "cv",
  number = 10)

set.seed(12345)
# Restricted linear model: only the 12 predictors found significant earlier,
# cross-validated on the full cleaned dataset.
lmFit1 <- train(Income ~ Kidhome + Teenhome + MntWines + MntMeatProducts + MntSweetProducts+ NumDealsPurchases+
                NumWebPurchases + NumCatalogPurchases + NumStorePurchases + NumWebVisitsMonth + AcceptedCmp4 + Basic,
                data = marketing_dat_clean, 
                method = "lm", 
                trControl = fitControl)

# Full linear model with every predictor. NOTE(review): this triggers
# repeated "rank-deficient fit" warnings — presumably because the complete
# dummy sets are linearly dependent; verify against the column set.
lmFit2 <- train(Income ~ .,
                data = marketing_dat_clean, 
                method = "lm", 
                trControl = fitControl)
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading
# CART regression tree; caret tunes the complexity parameter cp over a
# small default grid using the shared 10-fold CV.
rpartFit1 <- train(Income ~ ., 
                   data = marketing_dat_clean, 
                   method = "rpart", 
                   trControl = fitControl)
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo,
## : There were missing values in resampled performance measures.
# Random forest with mtry fixed at 3 via tuneGrid (no tuning search),
# evaluated with the same 10-fold CV.
rfFit1 <- train(Income ~ ., 
                data = marketing_dat_clean, 
                method = "rf", 
                tuneGrid = data.frame(mtry = 3),
                trControl = fitControl)

lmFit1
## Linear Regression 
## 
## 2216 samples
##   12 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 1993, 1994, 1993, 1995, 1993, 1994, ... 
## Resampling results:
## 
##   RMSE      Rsquared   MAE     
##   13473.09  0.7146677  7347.667
## 
## Tuning parameter 'intercept' was held constant at a value of TRUE
lmFit2
## Linear Regression 
## 
## 2216 samples
##   33 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 1993, 1994, 1995, 1994, 1995, 1993, ... 
## Resampling results:
## 
##   RMSE      Rsquared   MAE     
##   13909.66  0.6936883  7418.957
## 
## Tuning parameter 'intercept' was held constant at a value of TRUE
rpartFit1
## CART 
## 
## 2216 samples
##   33 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 1993, 1994, 1994, 1995, 1995, 1995, ... 
## Resampling results across tuning parameters:
## 
##   cp          RMSE      Rsquared   MAE     
##   0.04558861  17561.63  0.5027732  10241.56
##   0.06376280  17858.02  0.4812518  10901.44
##   0.36046053  23913.73  0.0731419  17446.52
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was cp = 0.04558861.
rfFit1
## Random Forest 
## 
## 2216 samples
##   33 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 1995, 1994, 1994, 1995, 1996, 1995, ... 
## Resampling results:
## 
##   RMSE      Rsquared   MAE     
##   13248.77  0.7291065  6635.649
## 
## Tuning parameter 'mtry' was held constant at a value of 3

Comparing all of our models, it turns out that on the training dataset the random forest model did best, because it has the highest \(R^2\) and the lowest RMSE.

However, we also wanted to test the models on a test dataset, so we split our data into 80% training data and 20% test data, and compared the models on the test set.

# 80/20 train-test split of the cleaned dataset.
# Fix: derive the train-set size from n instead of the hard-coded row count
# 2216, so the split stays correct if the data changes.
# NOTE(review): the caret models compared below were cross-validated on the
# FULL dataset, so these "test" rows were seen during training and the test
# errors reported next are optimistic — consider refitting on train_dat only.
n <- nrow(marketing_dat_clean)
n1 <- floor(n * 0.8) # number of observations in train set
set.seed(12345)
id_train <- sample(seq_len(n), n1)
train_dat <- marketing_dat_clean[id_train, ]
test_dat  <- marketing_dat_clean[-id_train, ]

# Predictions of each caret model on the held-out test rows.
y_hat_lmfit1 <- predict(lmFit1, newdata = test_dat)
# The full linear model again warns about its rank-deficient fit.
y_hat_lmfit2 <- predict(lmFit2, newdata = test_dat) 
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading
y_hat_rpartfit1 <- predict(rpartFit1, newdata = test_dat) 
y_hat_rffit1 <- predict(rfFit1, newdata = test_dat) 

# Test-set RMSE (sqrt of mean squared error) for each of the four models,
# in the order: lmFit1, lmFit2, rpartFit1, rfFit1.
sqrt(c(mean((y_hat_lmfit1 - test_dat$Income)^2), 
       mean((y_hat_lmfit2 - test_dat$Income)^2),
       mean((y_hat_rpartfit1 - test_dat$Income)^2),
       mean((y_hat_rffit1 - test_dat$Income)^2)))
## [1] 31565.56 31420.83 32672.96 19876.74
# NOTE(review): this recomputes all four RMSEs just to take the minimum;
# storing the vector in a variable first would avoid the duplication.
min(sqrt(c(mean((y_hat_lmfit1 - test_dat$Income)^2), 
       mean((y_hat_lmfit2 - test_dat$Income)^2),
       mean((y_hat_rpartfit1 - test_dat$Income)^2),
       mean((y_hat_rffit1 - test_dat$Income)^2))))
## [1] 19876.74

Comparing on the test data, the random forest model still has the lowest root mean squared error, so we concluded to use it for predicting our missing income values.

Conclusion

# Final step: impute the missing incomes with the random forest fitted on
# the full cleaned data (rrf), then display the 24 imputed values.
marketing_dat_noincome$Income <- predict(rrf, marketing_dat_noincome) #using the random forest model to predict variables
marketing_dat_noincome$Income
##  [1]  28561.02  85884.83  49689.80  41679.29  28326.17  30049.09  53517.49
##  [8]  35249.15  74217.80  55742.50  64942.32  74926.11  62958.78  52387.85
## [15]  38392.22  33474.83  35313.96  59536.34  34717.35  39434.31  51317.98
## [22]  48482.10  82141.37 106281.94

Finally, to visualize the differences between the different model predictions, we created a plot where we can see the income predictions of each model. Our final choice is the black line, the random forest model, which is mostly in between the red and the green line, as the regression tree seems to undervalue the income, while linear regression overvalues it.

# Predictions for the no-income customers from each model family, overlaid
# on one plot to compare how the three approaches rank each customer.
foresttt <- predict(rfFit1, marketing_dat_noincome)
rparttt <- predict(rpartFit1, marketing_dat_noincome)
# The full linear model warns again because of its rank-deficient fit.
lmmm <- predict(lmFit2, marketing_dat_noincome)
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading
plot(foresttt, ylim=c(25000, 110000), type='o', ylab = "Predicted Income")
points(rparttt, col='red', type='o')
points(lmmm, col='green', type='o')
legend("topright", c("Random Forest", "Regression Tree", "Linear Regression"), col=c("black","red","green"), pch=1)